Thu Jun 19 16:51:53 2025

Import the Seurat object: previously processed, annotated with cell labels, assigned gene-set scores using AUCell (cancersea state signature scores), and cells that contain EGFP plasmid fragments have been labeled.

seu <- readRDS(seurat_file)
# simplify cell type labels (group rare ones into "other")
seu$celltype <- ifelse(seu$celltype %in% names(which(table(seu$celltype) < 100)), 
                        'other', seu$celltype)
# update group labels 
conditions <- list('PoolA' = 'Recovery', 
               'PoolB' = 'DSS', 
               'PoolC' = 'Untreated')
seu$condition <- paste(conditions[seu$sample])
egfp_cells <- colnames(seu)[seu$egfp_status == 'EGFP_POS']

1 EGFP cells

From the single-cell sequencing of samples under 3 conditions, we managed to pick up about 1.5% of cells to contain at least one unique read originating from the EGFP-plasmid. Those cells are supposed to contain NFKB promoters thus, EGFP-positive cells are likely to contain increased NFKB activity.

knitr::kable(table(seu$egfp_status, seu$condition), 
caption = 'Number of cells per condition categorized by EGFP status')
Number of cells per condition categorized by EGFP status
DSS Recovery Untreated
EGFP_NEG 5257 7153 6561
EGFP_POS 76 116 116

2 UMAPs

Note: Black dots represent cells for which we could pick up at least one EGFP-plasmid fragment.

2.1 celltype

p <- DimPlot(seu, group.by = 'celltype')
df <- p[[1]]$data
p +  geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2), 
             color = 'black', size = 1)

2.2 condition

p <- DimPlot(seu, group.by = 'condition')
df <- p[[1]]$data
p +  geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2), 
             color = 'black', size = 1)

2.3 celltype + condition

p <- DimPlot(seu, group.by = 'celltype', split.by = 'condition')
df <- p[[1]]$data
p +  geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2), 
                color = 'black', size = 1)

3 CancerSEA signatures

Comparison of cancer state signature scores in egfp+ and egfp- cells in different conditions

3.1 By EGFP status

sigs <- colnames(seu@meta.data)[10:23]
dt <- data.table(seu@meta.data)

mdt <- melt.data.table(dt, measure.vars = sigs)

ggplot(mdt, aes(x = value, y = variable)) + 
  geom_density_ridges(aes(fill = egfp_status), alpha = 0.5) +
  scale_fill_manual(values = c('blue', 'red')) + 
  labs(y = 'Signature', x = 'AUCell score')

3.2 By condition

ggplot(mdt, aes(x = value, y = variable)) + 
  geom_density_ridges(aes(fill = condition), alpha = 0.5) +
  labs(y = 'Signature', x = 'AUCell score') 

3.3 By condition + egfp status

ggplot(mdt, aes(x = value, y = variable)) + 
  geom_density_ridges(aes(fill = egfp_status), alpha = 0.5) +
  scale_fill_manual(values = c('blue', 'red')) + 
  labs(y = 'Signature', x = 'AUCell score') +
  facet_grid(~ condition)

4 Marker Analysis - Epithelial Cells

Now, we focus on epithelial cells (cells annotated as epithelial and also are cdh1+) to look for NKFB activity signatures by comparing EGFP+ cells iwth EGFP- cells in different conditions.

4.1 Markers to cross-check with qPCR data

For each gene, we check ratio of epithelial cells expressing the marker in different conditions; split by egfp status

qPCR_markers <- c('Egfp', 'Lgr5', 'Chga', 'Muc2', 'Dclk1', 'Tnfaip3', 'Noxa1', 
                  'Tnf', 'Bcl2l1', 'Bcl2')
M <- GetAssayData(seu)
epithelial_cells <- intersect(colnames(seu)[seu$celltype == 'Epithelial cells'], 
                                      names(which(M['Cdh1',] > 0)))

#  setdiff(intersect(colnames(seu)[seu$celltype == 'Epithelial cells'], 
 #                                     names(which(M['Cdh1',] > 0))),
  #                          names(which(M['Ptprc',] > 0)))
  
seu_epi <- seu[,epithelial_cells]

dt <- data.table(seu_epi@meta.data, keep.rownames = T)
dt <- cbind(dt, sapply(qPCR_markers, function(x) {
  M[x, dt$rn]
}))

df <- do.call(rbind, lapply(qPCR_markers, function(x) {
  s <- dt[,list('percent_expressed' = round(sum(get(x) > 0)/length(get(x)) * 100, 1), 
          'avg_expression' = mean(get(x))),by = c('condition', 'egfp_status')]
  s$gene <- x
  return(s)
}))

df$egfp_status <- factor(df$egfp_status, levels = c('EGFP_POS', 'EGFP_NEG'))
df$condition <- factor(df$condition, levels = c('Untreated', 'DSS', 'Recovery'))
ggplot(df, aes(x = egfp_status, y = percent_expressed)) + 
  geom_bar(stat = 'identity', aes(fill = condition), position = 'dodge') + 
  facet_wrap(~ gene, scales = 'free', nrow = 2) + 
  scale_fill_brewer(type = 'qual', palette = 6)

4.2 Dot Plots of Markers

4.2.1 Top markers

# find markers for egfp+/egpf- in cdh1+ epithelial cells 
markers <- sapply(simplify = F, unique(seu_epi$condition), function(x) {
  message(date(), " => computing markers for condition ",x)
  dt <- data.table(FindMarkers(seu_epi[,seu_epi$condition == x], 
                               min.pct = 0.1, 
                               logFoldChange = 0.25,
                         test.use = 'wilcox', 
                         ident.1 = 'EGFP_POS',
                         ident.2 = 'EGFP_NEG',
                         only.pos = TRUE, 
                         group.by = 'egfp_status'), 
             keep.rownames = T)
  dt$condition <- x
  return(dt)
})
markers <- do.call(rbind, markers) 

m <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:21], by = .(condition)][!is.na(rn)][rn != 'Egfp']

p <- DotPlot(seu_epi[,seu_epi$egfp_status == 'EGFP_POS'], 
             features = unique(m$rn), 
      group.by = 'condition', scale = F) + coord_flip()

df <- p$data

df$id <- factor(df$id, levels = c('Untreated', 'DSS', 'Recovery'))

ggplot(df, aes(x = features.plot, y = id)) + 
  geom_point(aes(color = avg.exp.scaled, size = pct.exp)) +
  labs(color = "Average\nExpression", size = "Percent\nExpression") + 
#  scale_color_gradient(low = '#672976', high = 'yellow') +  
  scale_color_gradientn(colors = colorRampPalette(c("#313e93", "#b252a2", "#f4a16f", "#eeea59"))(50)) + 
  scale_size(range = c(0, 6)) + 
  theme(axis.title = element_blank(), 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, face = 'italic'), 
        axis.text = element_text(size = 14, family = 'Arial'),
        legend.title = element_text(family = 'Arial'))

4.2.2 Subset of top markers

# subset of genes of interest
# I don't know how these are selected by Marina
goi <- c('Steap4', 'Smim19', 'Cox17', 'Nfkbib', 'Man2a2', 'Sept10', 'Acot13', 'Ndufv2', 'Mrps15', 'Aqp8', 'H2-Eb1', 'Ly6g', 'Atf3')

p <- DotPlot(seu_epi[,seu_epi$egfp_status == 'EGFP_POS'], 
             features = goi, 
      group.by = 'condition', scale = F) + coord_flip()

df <- p$data

df$id <- factor(df$id, levels = c('Untreated', 'DSS', 'Recovery'))

ggplot(df, aes(x = features.plot, y = id)) + 
  geom_point(aes(color = avg.exp.scaled, size = pct.exp)) +
  labs(color = "Average\nExpression", size = "Percent\nExpression") + 
  scale_color_gradientn(colors = colorRampPalette(c("#313e93", "#b252a2", "#f4a16f", "#eeea59"))(50)) +  
  scale_size(range = c(0, 6)) + 
  theme(axis.title = element_blank(), 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, face = 'italic'), 
        axis.text = element_text(size = 14, family = 'Arial'),
        legend.title = element_text(family = 'Arial'))

4.3 Heatmaps

Found top markers expressed in at least 15% of EGFP+ cells and filtered for padj < 0.1

4.3.1 Average Marker Expression

colorPal <- colorRampPalette(c("darkorchid4", "yellow"))(50)
M <- GetAssayData(seu_epi)

#top <- markers[pct.1 > 0.15][p_val_adj < 0.1]

top <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:20], by = .(condition)][!is.na(rn)]

B <- get_basis_matrix(t(as.matrix(M[unique(top$rn),])), 
                      as.factor(paste(seu_epi$condition, seu_epi$egfp_status)))

sample_order <- c('Untreated EGFP_POS', 'Untreated EGFP_NEG', 
                  'DSS EGFP_POS', 'DSS EGFP_NEG', 
                  'Recovery EGFP_POS', 'Recovery EGFP_NEG')

pheatmap::pheatmap(t(B[sample_order,]), scale = 'row', cluster_cols = F, 
                   cellwidth = 20, fontsize_row = 9, color = colorPal, 
                   gaps_col = c(2,4))

4.3.2 Ratio of cells expressing the marker

# import nfkb target genes; (for marker analysis )
nfkb <- readLines(file.path(datadir, 'nfkb_targets_mouse.txt'))
dt <- melt.data.table(markers, id.vars = c('rn', 'condition'), measure.vars = c('pct.1', 'pct.2'))
dt$variable <- ifelse(dt$variable == 'pct.1', 'EGFP_POS', 'EGFP_NEG')
dt$group <- paste(dt$condition, dt$variable)
dtc <- dcast.data.table(dt, rn ~ group, value.var = 'value')
dtc[is.na(dtc)] <- 0
m <- as.matrix(data.frame(dtc[,-1], row.names = dtc[[1]], check.names = F))

genes <- unique(top$rn)
ann_row <- data.frame('condition' = markers[rn %in% genes, .SD[which.min(p_val)],by = rn][match(genes, rn)]$condition, 
                      #'is_nfkb_target' = as.factor(genes %in% nfkb),
                      row.names = genes)

pheatmap::pheatmap(m[genes, sample_order], scale = 'row', 
                   color = colorPal, cellwidth = 20, 
                   main = 'Ratio of cells expressing top markers', 
                   annotation_row = ann_row,
                   gaps_col = c(2, 4),
                   cluster_cols = F, 
                   fontsize = 9, cutree_rows = 3)

5 Revisiting Bulk RNAseq data

We want to apply some things learned from the single-cell data to the bulk RNA-seq data we have analyzed before.

# get RUVSeq normalized counts from bulk rnaseq data 
sample_sheet <- read.csv(file.path(datadir, 'rnaseq', 'sample_sheet.csv'))
counts <- read.table(file.path(datadir, 'rnaseq', 'feature_counts', 'raw_counts', 'hisat2', 'counts.tsv'), header = T, check.names = F)
colData <- data.frame(sample_sheet[,-1], row.names = sample_sheet$name)

set <- newSeqExpressionSet(counts = as.matrix(counts),
                           phenoData = colData)
differences <- makeGroups(colData$sample_type)
set_s <- RUVs(set, unique(rownames(set)),
              k=10, differences) #all genes

norm_counts <- log(normCounts(set_s)+1)

5.1 NFKB target genes

Make a heatmap of NFKB target genes (exclusively expressed in Cdh1+ epithelial cells only )

# subset norm_counts for nfkb genes 
ids <- readRDS(file.path(datadir, 'ensmusg2name.RDS'))
nfkb_ids <- ids[match(nfkb, name)]$id
M_nfkb <- norm_counts[nfkb_ids,]
# convert to gene names 
rownames(M_nfkb) <- ids[match(rownames(M_nfkb), id)]$name

M <- GetAssayData(seu_epi)
nonepi_cells <- colnames(seu)[seu$celltype != 'Epithelial cells']
M_nonepi <- GetAssayData(seu[,nonepi_cells])
genes <- intersect(rownames(M), rownames(M_nfkb))

# get genes expressed in at least 1% of epithelial cells
genes <- names(which(apply(M[genes,], 1, function(x) (sum(x > 0) / length(x)) * 100) > 0.5))

# remove genes expressed in more than 5% of the non epithelial cells 
genes <- names(which(apply(M_nonepi[genes,], 1, function(x) (sum(x > 0) / length(x)) * 100) < 3))

# keep activated / repressed samples only 
genes <- names(which(rowSums(M_nfkb[genes,]) > 0))

colData$nfkb <- gsub(".+_", "", colData$sample_type)
colData$dss <- gsub("_.+", "", colData$sample_type)

pheatmap::pheatmap(M_nfkb[genes, ], scale = 'row', 
                   annotation_col = colData[,c('nfkb','dss'),drop=F],
                   cluster_cols = F,
                   cellwidth = 15, 
                   color = rev(colorRampPalette(RColorBrewer::brewer.pal(11, "RdYlBu"))(10)), 
                   gaps_col = c(8, 16),
                   cutree_rows = 5, 
                   show_colnames = F)

5.2 GSEA of bulk rnaseq samples

We score bulk rnaseq samples using different gene signatures.

score_gene_sets <- function(exprs, genesets) {
  rankData <- singscore::rankGenes(exprs)
  s <- pbapply::pbsapply(genesets, function(x) {
    x <- intersect(x, rownames(rankData))
    singscore::simpleScore(rankData, upSet = x)[['TotalScore']]
  })
  rownames(s) <- colnames(exprs)
  return(s)
}

find_differential_signatures <- function(scores, samplesA, samplesB) {
  groupA <- intersect(colnames(scores), samplesA)
  groupB <- intersect(colnames(scores), samplesB)

  message("Comparing ",length(groupA), " samples (A) with ",length(groupB), " samples (B)")

  stats <- do.call(rbind, lapply(rownames(scores), function(x) {
    a <- scores[x, groupA]
    b <- scores[x, groupB]
    data.table('signature' = x, 'groupA' = mean(a),
               'groupB' = mean(b),
               'pval' = wilcox.test(a, b)[['p.value']])
  }))
  stats$padj <- p.adjust(stats$pval, method = 'BH')
  stats <- stats[order(pval)]

  return(stats)
}

gex <- data.frame(norm_counts, check.names = F)
# convert to gene names 
ids <- readRDS(file.path(datadir, 'ensmusg2name.RDS'))
gex$name <- ids[match(rownames(gex), id)]$name
gex <- gex[!is.na(gex$name),]
gex <- gex[!duplicated(gex$name),]
rownames(gex) <- gex$name
gex$name <- NULL

5.3 EGFP+ markers

For each condition, we found the signature genes that distinguish egfp+ from egfp- cells. Now, we score the bulk samples by these signatures.

m <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:51], by = .(condition)][rn != 'Egfp']

egfp_signatures <- sapply(simplify = F, unique(m$condition), function(x) m[condition == x]$rn)

egfp_signature_scores <- score_gene_sets(gex, egfp_signatures)
egfp_signature_scores <- egfp_signature_scores[,c('Untreated', 'DSS', 'Recovery')]

pheatmap::pheatmap(t(egfp_signature_scores), cluster_cols = F, 
                   scale = 'row',
                   annotation_col = colData[,c('nfkb','dss'),drop=F],
                   cellwidth = 15,  cellheight = 10,
                   color = colorRampPalette(c("#6298c8", "white", "#ae3d7f"))(10),                    cluster_rows = F, 
                   show_colnames = F, 
                   gaps_col = c(8, 16), 
                   main = 'EGFP+ condition signature scores in bulk rnaseq')

5.4 Immune Cell type markers

xcell <- readRDS(file.path(datadir, 'xCell_signatures.mouse_genenames.RDS'))

# immune cell groups from xcell 
immune_cells <- c("B-cells", "CD4+ T-cells", 
                  "CD8+ T-cells", "DC", "Eosinophils", "Macrophages", "Monocytes", 
                  "Mast cells", "Neutrophils", "NK cells")
xcell_scores <- score_gene_sets(gex, xcell[immune_cells])

pheatmap::pheatmap(t(xcell_scores), scale = 'row',
                   annotation_col = colData[,c('nfkb','dss'),drop=F],
                   cellwidth = 15, cellheight = 10,
                   show_colnames = F,
                   cluster_cols = F,
                   gaps_col = c(8, 16), 
                   color = colorRampPalette(c("#6298c8", "white", "#ae3d7f"))(10),
                   main = 'Immune Cell Type Signature Scores in Bulk RNAseq')

6 SessionInfo

print(sessionInfo())
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Europe/Berlin
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] Seurat_5.0.0                SeuratObject_5.1.0         
##  [3] sp_2.2-0                    RUVSeq_1.42.0              
##  [5] edgeR_4.6.2                 limma_3.64.1               
##  [7] EDASeq_2.42.0               ShortRead_1.66.0           
##  [9] GenomicAlignments_1.44.0    SummarizedExperiment_1.38.1
## [11] MatrixGenerics_1.20.0       matrixStats_1.5.0          
## [13] Rsamtools_2.24.0            GenomicRanges_1.60.0       
## [15] Biostrings_2.76.0           GenomeInfoDb_1.44.0        
## [17] XVector_0.48.0              IRanges_2.42.0             
## [19] S4Vectors_0.46.0            BiocParallel_1.42.0        
## [21] Biobase_2.68.0              BiocGenerics_0.54.0        
## [23] generics_0.1.4              ggridges_0.5.6             
## [25] ggrepel_0.9.6               stringr_1.5.1              
## [27] gprofiler2_0.2.3            ggpubr_0.6.0               
## [29] ggplot2_3.5.2               pheatmap_1.0.12            
## [31] data.table_1.17.4           openxlsx_4.2.8             
## [33] rmarkdown_2.29             
## 
## loaded via a namespace (and not attached):
##   [1] spatstat.sparse_3.1-0   bitops_1.0-9            httr_1.4.7             
##   [4] RColorBrewer_1.1-3      tools_4.5.0             sctransform_0.4.2      
##   [7] backports_1.5.0         R6_2.6.1                lazyeval_0.2.2         
##  [10] uwot_0.2.3              withr_3.0.2             prettyunits_1.2.0      
##  [13] gridExtra_2.3           progressr_0.15.1        cli_3.6.5              
##  [16] spatstat.explore_3.4-3  fastDummies_1.7.5       labeling_0.4.3         
##  [19] sass_0.4.10             spatstat.data_3.1-6     pbapply_1.7-2          
##  [22] R.utils_2.13.0          parallelly_1.44.0       RSQLite_2.3.11         
##  [25] BiocIO_1.18.0           hwriter_1.3.2.1         ica_1.0-3              
##  [28] spatstat.random_3.4-1   car_3.1-3               dplyr_1.1.4            
##  [31] zip_2.3.3               Matrix_1.7-3            interp_1.1-6           
##  [34] abind_1.4-8             R.methodsS3_1.8.2       lifecycle_1.0.4        
##  [37] yaml_2.3.10             carData_3.0-5           SparseArray_1.8.0      
##  [40] BiocFileCache_2.16.0    Rtsne_0.17              grid_4.5.0             
##  [43] blob_1.2.4              promises_1.3.2          crayon_1.5.3           
##  [46] pwalign_1.4.0           miniUI_0.1.2            lattice_0.22-6         
##  [49] cowplot_1.1.3           annotate_1.86.0         GenomicFeatures_1.60.0 
##  [52] KEGGREST_1.48.0         pillar_1.10.2           knitr_1.50             
##  [55] rjson_0.2.23            future.apply_1.11.3     codetools_0.2-20       
##  [58] leiden_0.4.3.1          glue_1.8.0              spatstat.univar_3.1-3  
##  [61] vctrs_0.6.5             png_0.1-8               spam_2.11-1            
##  [64] gtable_0.3.6            cachem_1.1.0            aroma.light_3.38.0     
##  [67] xfun_0.52               S4Arrays_1.8.0          mime_0.13              
##  [70] survival_3.8-3          statmod_1.5.0           fitdistrplus_1.2-2     
##  [73] ROCR_1.0-11             nlme_3.1-168            bit64_4.6.0-1          
##  [76] progress_1.2.3          filelock_1.0.3          RcppAnnoy_0.0.22       
##  [79] bslib_0.9.0             irlba_2.3.5.1           KernSmooth_2.23-26     
##  [82] DBI_1.2.3               tidyselect_1.2.1        bit_4.6.0              
##  [85] compiler_4.5.0          curl_6.2.3              graph_1.86.0           
##  [88] httr2_1.1.2             xml2_1.3.8              DelayedArray_0.34.1    
##  [91] plotly_4.10.4           rtracklayer_1.68.0      scales_1.4.0           
##  [94] lmtest_0.9-40           rappdirs_0.3.3          digest_0.6.37          
##  [97] goftest_1.2-3           spatstat.utils_3.1-4    presto_1.0.0           
## [100] htmltools_0.5.8.1       pkgconfig_2.0.3         jpeg_0.1-11            
## [103] dbplyr_2.5.0            fastmap_1.2.0           rlang_1.1.6            
## [106] htmlwidgets_1.6.4       UCSC.utils_1.4.0        shiny_1.10.0           
## [109] farver_2.1.2            jquerylib_0.1.4         zoo_1.8-14             
## [112] jsonlite_2.0.0          R.oo_1.27.1             RCurl_1.98-1.17        
## [115] magrittr_2.0.3          Formula_1.2-5           GenomeInfoDbData_1.2.14
## [118] dotCall64_1.2           patchwork_1.3.0         Rcpp_1.0.14            
## [121] reticulate_1.42.0       stringi_1.8.7           MASS_7.3-65            
## [124] plyr_1.8.9              parallel_4.5.0          listenv_0.9.1          
## [127] deldir_2.0-4            splines_4.5.0           tensor_1.5             
## [130] hms_1.1.3               locfit_1.5-9.12         igraph_2.1.4           
## [133] spatstat.geom_3.4-1     ggsignif_0.6.4          RcppHNSW_0.6.0         
## [136] reshape2_1.4.4          biomaRt_2.64.0          XML_3.99-0.18          
## [139] evaluate_1.0.3          latticeExtra_0.6-30     httpuv_1.6.16          
## [142] RANN_2.6.2              tidyr_1.3.1             purrr_1.0.4            
## [145] polyclip_1.10-7         future_1.49.0           scattermore_1.2        
## [148] broom_1.0.8             xtable_1.8-4            restfulr_0.0.15        
## [151] RSpectra_0.16-2         rstatix_0.7.2           later_1.4.2            
## [154] viridisLite_0.4.2       singscore_1.28.0        tibble_3.2.1           
## [157] memoise_2.0.1           AnnotationDbi_1.70.0    cluster_2.1.8.1        
## [160] globals_0.18.0          GSEABase_1.70.0